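# A simple data-analysis algorithm: a hotness score computed as a weighted average:
# hotness = w1 × likes + w2 × stars (favorites) + w3 × comments
# where w1, w2 and w3 are the weight coefficients.
# The algorithm is simple but has limitations: it ignores factors such as the article's publish time, view count and the quality of the comments.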

import os
import re
from collections import Counter

import pandas as pd


class InfluenceStatistical:
    """Rank crawled articles by a weighted "hotness" score.

    hotness = LIKES_WEIGHT * likes + STARS_WEIGHT * stars
              + COMMENTS_WEIGHT * comments

    The input is a CSV file with the columns listed in ``_COLUMNS``.  Rows
    whose like/star/comment counts cannot be interpreted as numbers are left
    out of the ranking.
    """

    # Weight coefficients of the hotness formula.
    LIKES_WEIGHT = 0.3
    STARS_WEIGHT = 0.5
    COMMENTS_WEIGHT = 0.2
    # Number of top-ranked entries kept for the front end.
    TOP_N = 100
    # Multipliers for abbreviated counts such as "1.5k" or "3w".
    _UNITS = {'k': 1000, 'w': 10000}
    # Leading number followed by an optional one-character unit.
    _COUNT_PATTERN = re.compile(r'(\d+\.?\d*)(\w?)')
    # CSV columns, in the order extract_hotRank() expects its arguments.
    _COLUMNS = (
        "title", "url", "likes", "stars", "comments",
        "author", "author_url", "category", "time",
    )

    def __init__(self):
        # Ranking produced by the most recent statistical() call.
        self.hotRank = []
        # Directory two levels above this file.
        self.my_BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        # csv_path: the CSV file to read (produced by the crawler).
        self.csv_path = os.path.join(self.my_BASE_DIR, 'static', 'csv_collect', 'deep_articles.csv')

    def statistical(self):
        """Read the CSV, score every usable row and return the top entries.

        Returns:
            A list of at most ``TOP_N`` dicts (keys: title, url, hotness,
            author, author_url, category, time) sorted by descending
            hotness.  The same list is stored in ``self.hotRank``.

        Raises:
            KeyError: if one of the expected columns is missing.
            Whatever ``pandas.read_csv`` raises when the file is unreadable.
        """
        df = pd.read_csv(self.csv_path, sep=",", header=0, encoding="utf-8")

        # Start from a clean list so that calling this method again does not
        # append the same rows a second time.
        self.hotRank = []

        # Walk the needed columns in lockstep; this does not depend on the
        # DataFrame index labels being 0..n-1.
        for row in zip(*(df[name] for name in self._COLUMNS)):
            self.extract_hotRank(*row)

        # Highest hotness first (stable, so ties keep their file order).
        self.hotRank.sort(key=lambda entry: entry["hotness"], reverse=True)
        self.hotRank = self.hotRank[:self.TOP_N]

        return self.hotRank

    def str_to_num(self, s):
        """Convert a count such as ``"15"``, ``"1.5k"`` or ``"3W"`` to a float.

        Args:
            s: A string, or an already numeric value (as pandas yields for
               purely numeric columns).

        Returns:
            The value as a float with a recognised unit suffix (``k`` = 1000,
            ``w`` = 10000, case-insensitive) applied, or ``None`` when the
            value is missing (NaN) or does not start with a number.
        """
        if not isinstance(s, str):
            # Numeric cells arrive as int/float/numpy scalars; empty cells
            # arrive as NaN.
            try:
                num = float(s)
            except (TypeError, ValueError):
                return None
            return None if pd.isna(num) else num

        match = self._COUNT_PATTERN.match(s.strip())
        if match is None:
            return None
        num = float(match.group(1))
        # An unknown (or absent) unit leaves the number unchanged.
        return num * self._UNITS.get(match.group(2).lower(), 1)

    def _parse_count(self, value):
        """Return *value* as an int count, or ``None`` if it is unusable."""
        num = self.str_to_num(value)
        if num is None:
            return None
        try:
            # round() rather than int(): multiplying by the unit can leave
            # a float error such as 1004.9999999999999 that would truncate.
            return int(round(num))
        except (OverflowError, ValueError):
            # Infinite values cannot be turned into a count.
            return None

    def extract_hotRank(self, title, url, likes, stars, comments, author, author_url, category, time):
        """Score one article and append it to ``self.hotRank``.

        The row is skipped (nothing is appended) when any of ``likes``,
        ``stars`` or ``comments`` is empty, missing, ``"NOTFOUND"`` or
        otherwise not interpretable as a number.
        """
        like_count = self._parse_count(likes)
        star_count = self._parse_count(stars)
        comment_count = self._parse_count(comments)
        if like_count is None or star_count is None or comment_count is None:
            return

        hotness = (
            self.LIKES_WEIGHT * like_count
            + self.STARS_WEIGHT * star_count
            + self.COMMENTS_WEIGHT * comment_count
        )
        self.hotRank.append({
            "title": title,
            "url": url,
            "hotness": hotness,
            "author": author,
            "author_url": author_url,
            "category": category,
            "time": time,
        })

# Script entry point: compute the ranking once when run directly.
if __name__ == "__main__":
    # Report any failure on stdout instead of letting a traceback escape.
    try:
        InfluenceStatistical().statistical()
    except Exception as err:
        print("错误:", err)
